# Load required packages and data
library(tidyverse)
library(pander)
library(fitdistrplus)
library(AER)
dat <- readRDS("t_test_data_sbs_pt01_10sub_10clust_10000sims.rds")

pval_b1_model <- unlist(dat[1,])
est_sb2 <- unlist(dat[2,]) # let's assume anything less than 1e-6 is the weird ones
tiny_est <- est_sb2 < 1e-6
resid_var_model <- unlist(dat[3,])
MSB <- unlist(dat[5,])
MSE <- unlist(dat[6,])
clustvar0 <- unlist(dat[7,])
clustvar1 <- unlist(dat[8,])
ftest_pval <- unlist(dat[9,])
pval_tt_eq_var <- unlist(dat[13,])
pval_tt_uneq_var <- unlist(dat[14,])
df_tt_uneq_var <- unlist(dat[15,])
beta0 <- unlist(dat[17,])
beta0_se <- unlist(dat[18,])
beta1 <- unlist(dat[19,])
beta1_se <- unlist(dat[20,])
plotdat <- cbind.data.frame(tiny_est, pval_b1_model, est_sb2, resid_var_model, #modelNLME,
                            MSB, MSE,
                            clustvar0, clustvar1, ftest_pval,# ttest_eq_var, ttest_uneq_var,
                            #clustmeans,
                            pval_tt_eq_var, pval_tt_uneq_var, df_tt_uneq_var,
                            beta0, beta0_se, beta1, beta1_se)#, test)
plotdat$icc <- MSB / (MSB + MSE)

Data-generating mechanism

Gathering data on how a ‘naive’ NLME model estimates those parameters, as well as what equal- and unequal-variance t-tests show.

When we get the tiny \(\sigma_b^2\) estimates, ratio of MSB to MSE from original dataset is small

Mean square within clusters (clusters \(i = 1, 2, ..., k\); subjects \(j = 1, 2, ..., n\) in each cluster): \[ MSE = \frac{1}{k(n-1)} \sum_i \sum_j (y_{ij} - \bar{y}_{i.} )^2 \] Mean square between clusters: \[ MSB = \frac{1}{k-1} \sum_i \sum_j (\bar{y}_{i.} - \bar{y}_{..} )^2 \]

I looked at a lot of relationships between variables, and the ratio \(MSB / MSE\) was the one that popped out as a clear predictor of a tiny estimate.

cutoff_MSBMSE <- min(MSB[tiny_est==F] / MSE[tiny_est==F])
cutoff_ICC <- min(plotdat$icc[tiny_est==F])

ggplot(data = plotdat) +
  geom_histogram(aes(x = MSB/MSE), color = "black", fill = "grey") +
  labs(title = "Ratio around .1 highly determinative of a tiny estimate") +
  facet_grid(rows = vars(tiny_est), labeller = label_both)

ggplot(data = plotdat) +
  geom_histogram(aes(x = MSB), color = "black", fill = "grey") +
  labs(title = "MSB alone is less predictive") +
  facet_grid(rows = vars(tiny_est), labeller = label_both)

ggplot(data = plotdat) +
  geom_point(aes(x = MSB, y = MSE, color = tiny_est), alpha = .2, stroke = 0) +
  labs(title = "Ratio around .1 highly determinative of a tiny estimate")

ggplot(data = plotdat) +
  geom_point(aes(x = MSB, y = MSE, color = tiny_est), alpha = .2, stroke = 0) + geom_abline(aes(slope = 1/cutoff_MSBMSE, intercept = 0)) +
  labs(title = "More variability in that finding among the tiny estimates") +
  facet_grid(rows = vars(tiny_est), labeller = label_both)

ggplot(data = plotdat) +
  labs(title = "No change in the pattern as MSE varies") +
  geom_point(aes(x = MSE, y = MSB/MSE, color = tiny_est), alpha = .2, stroke = 0) + geom_hline(aes(yintercept = cutoff_MSBMSE))

ggplot(data = plotdat) +
  labs(title = "No change in the pattern as MSB varies") +
  geom_point(aes(x = MSB, y = MSB/MSE, color = tiny_est), alpha = .2, stroke = 0) + geom_hline(aes(yintercept = cutoff_MSBMSE))

ggplot(data = plotdat) +
  geom_point(aes(x = clustvar0/clustvar1, y = MSB/MSE, color = tiny_est), alpha = .3, stroke = 0) + xlim(c(0,4))

  labs(title = "Pattern seems invariant to ratio of between-cluster variances")
## $title
## [1] "Pattern seems invariant to ratio of between-cluster variances"
## 
## attr(,"class")
## [1] "labels"
ggplot(data = plotdat) +
  geom_point(aes(y = MSB/MSE, x = clustvar0 - clustvar1, color = tiny_est), alpha = .3, stroke = 0) +
  labs(title = "Pattern seems invariant to difference in between-cluster variances")

ggplot(data = plotdat) +
  geom_point(aes(x = MSE, y = icc, color = tiny_est), alpha = .2, stroke = 0) +
  labs(title = "Using the ICC instead of the ratio might also work, results are the same") +
  geom_hline(aes(yintercept = cutoff_ICC)) +
  facet_grid(rows = vars(tiny_est), labeller = label_both)

ggplot(data = plotdat) +
  geom_point(aes(x = MSE, y = MSB/MSE, color = tiny_est), alpha = .2, stroke = 0) +
  labs(title = "Using the ratio") +
  geom_hline(aes(yintercept = cutoff_MSBMSE)) +
  facet_grid(rows = vars(tiny_est), labeller = label_both)

sum((MSB[tiny_est==T] / MSE[tiny_est==T]) > cutoff_MSBMSE)
## [1] 628
sum((plotdat$icc[tiny_est==T]) > cutoff_ICC)
## [1] 628

What to make of this? Just a signal-to-noise issue?

What about the cases that violate that boundary line?

First, find some wild outliers and examine them…

library(ggplot2)
library(pander)
outliers <- plotdat[ MSB/MSE > cutoff_MSBMSE+.05 & tiny_est == T, ]

outindx <- c(779, 4719, 6551, 7571)
plotdat$outlier <- FALSE
plotdat$outlier[outindx] <- TRUE

ggplot(data = plotdat) +
  geom_point(aes(x = MSE, y = MSB/MSE, color = tiny_est, stroke = outlier, alpha = outlier)) +
  labs(title = "Looking at these four most extreme outliers...") +
  geom_hline(aes(yintercept = cutoff_MSBMSE))

pander(outliers[,c(4:7,13,15)]) # Clue!
  resid_var_model MSB MSE clustvar0 beta0 beta1
779 0.9266 0.1417 0.9401 0.04995 -0.3351 0.5034
4719 0.9447 0.1444 0.9594 0.03254 0.1923 -0.5114
6551 0.9413 0.1563 0.9545 0.06521 -0.3572 0.5499
7571 0.8346 0.1555 0.856 0.05251 0.4466 -0.6063
ggplot(data = plotdat[tiny_est==T,]) +
  geom_point(aes(x = MSE, y = MSB/MSE, color = abs(beta0 - beta1)), stroke = 0, alpha = .5) +
  labs(title = "Outliers more common with large differences in model beta coefs...") +
  geom_hline(aes(yintercept = cutoff_MSBMSE))

ggplot(data = plotdat[tiny_est==T,]) +
  geom_point(aes(x = MSE, y = MSB/MSE, stroke = outlier, color = abs(beta0 - beta1) < .45), alpha = .5) +
  labs(title = "... easier to see when discretized.") +
  geom_hline(aes(yintercept = cutoff_MSBMSE))

# Something with the underlying data sets?
# outlier_datasets <- dat[16,outindx]
# ggplot(data = outlier_datasets[[1]]) + geom_point(aes(x = clustid, y = linpred, color = arm_factor))

Still, outliers not fully explained; this isn’t a perfect predictor of the outliers. Other looks at the outlier data sets haven’t turned up any more clues (yet).

What about the p-value / t-test data?

ggplot(data = plotdat) +
  geom_histogram(aes(x = pval_tt_uneq_var), color = "black", fill = "grey") +
  facet_grid(rows = vars(tiny_est), labeller = label_both) +
  xlab("p-value for welch t-test, allows unequal variances (results the same with eq vars)") +
  labs(title = "Small p-values on a t-test correlated with tiny estimates; makes sense, more power")

ggplot(data = plotdat) +
  geom_histogram(aes(x = pval_b1_model), color = "black", fill = "grey") +
  facet_grid(rows = vars(tiny_est), labeller = label_both) +
  xlab("p-value from model") +
  labs(title = "Model p-values are 'missing' low p-values, all in cases of tiny estimates")

sum(pval_tt_uneq_var < .05) / length(pval_tt_uneq_var)
## [1] 0.0461
sum(pval_b1_model < .05) / length(pval_b1_model)
## [1] 0.0331

The difference in TIE rates is being driven by the tiny estimates

Let’s find some cases here…

lostcases <- plotdat[plotdat$pval_tt_uneq_var < .05 & plotdat$pval_b1_model > .05 & plotdat$tiny_est == T,] # 136 cases
head(lostcases)
##     tiny_est pval_b1_model      est_sb2 resid_var_model        MSB
## 49      TRUE    0.07582983 2.869309e-09       1.1583300 0.10639340
## 253     TRUE    0.06025526 3.539340e-09       0.8010241 0.07998462
## 259     TRUE    0.05422782 1.309590e-08       1.0300760 0.11488043
## 372     TRUE    0.06706779 1.436369e-09       1.0424600 0.08681763
## 417     TRUE    0.05515801 4.698646e-09       1.1309040 0.12026116
## 426     TRUE    0.08530929 1.776377e-09       1.1172840 0.09358004
##           MSE  clustvar0  clustvar1 ftest_pval pval_tt_eq_var
## 49  1.1846991 0.05939007 0.11953822 0.31213910     0.04596725
## 253 0.8145855 0.05803124 0.07505081 0.70786249     0.04113355
## 259 1.0360896 0.09756757 0.09641971 0.98622407     0.04795239
## 372 1.0770618 0.11066439 0.02862313 0.05656667     0.02831879
## 417 1.1434710 0.07187512 0.12917239 0.39562969     0.04320518
## 426 1.1508116 0.04952919 0.10687307 0.26739459     0.04308825
##     pval_tt_uneq_var df_tt_uneq_var      beta0   beta0_se      beta1
## 49        0.04759610       16.17248 -0.2860586 0.10762575  0.2867473
## 253       0.04136019       17.71034  0.1214233 0.08949995 -0.2537595
## 259       0.04795290       17.99937 -0.2756375 0.10149265  0.2955817
## 372       0.03256530       13.36374  0.1285441 0.10210091 -0.2814062
## 417       0.04435176       16.64784 -0.1940349 0.10634399  0.3083940
## 426       0.04498715       15.86703 -0.1546564 0.10570167  0.2721765
##      beta1_se        icc outlier
## 49  0.1522058 0.08240572   FALSE
## 253 0.1265720 0.08941124   FALSE
## 259 0.1435323 0.09981183   FALSE
## 372 0.1443925 0.07459332   FALSE
## 417 0.1503931 0.09516348   FALSE
## 426 0.1494847 0.07520144   FALSE
lostindx <- as.numeric(rownames(lostcases))
plotdat$lost <- FALSE
plotdat$lost[lostindx] <- TRUE

ggplot(data = plotdat[plotdat$tiny_est==T,]) +
  geom_point(aes(x = MSE, y = MSB/MSE, color = lost), stroke = 0, alpha = .5) +
  labs(title = "Looking at the lost cases... seem evenly distributed by MSE and MSB/MSE ratio") +
  geom_hline(aes(yintercept = cutoff_MSBMSE))

# Checked cluster variances, f tests, beta0, stderrs, no patterns
# 

ggplot(data = plotdat[plotdat$tiny_est==T,]) +
  geom_point(aes(x = beta0, y = beta1, color = lost), stroke = 0, alpha = .5) +
  labs(title = "When beta1 estimate is around +/- .25?")

ggplot(data = plotdat[plotdat$tiny_est==T,]) +
  geom_point(aes(x = pval_tt_uneq_var, y = MSB/MSE, color = lost), stroke = 0, alpha = .5) +
  labs(title = "When MSB/MSE and t-test p value is in that sliver?")

ggplot(data = plotdat[plotdat$tiny_est==T,]) +
  geom_point(aes(x = pval_tt_uneq_var, y = MSB/MSE, color = lost), stroke = 0, alpha = .5) +
  xlim(c(0, .1))

  labs(title = "ZOOM IN - When MSB/MSE and t-test p value is in that sliver?")
## $title
## [1] "ZOOM IN - When MSB/MSE and t-test p value is in that sliver?"
## 
## attr(,"class")
## [1] "labels"
ggplot(data = plotdat[plotdat$tiny_est==T,]) +
  geom_point(aes(x = pval_tt_eq_var, y = MSB/MSE, color = lost), stroke = 0, alpha = .5) +
  xlim(c(0, .1)) +
  labs(title = "Note: 'equal variance' t-test has a fuzzier border")

ggplot(data = plotdat[plotdat$tiny_est==T,]) +
  geom_point(aes(x = abs(beta1), y = MSB/MSE, color = lost), stroke = 0, alpha = .5) +
  labs(title = "A certain beta1 magnitude and a certain ratio?")